TIPS
Write commands in code editor of R Studio and run them using icon
Run in R Studio.
We need several R packages for our analysis today. Some are installed through bioconductor, some from CRAN through install.packages.
Bioconductor
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
Seurat
BiocManager::install('multtest', update=F)
install.packages('Seurat')
Other CRAN packages
install.packages('Matrix')
install.packages('dplyr')
install.packages('reshape2')
install.packages('fossil')
Bioconductor packages
BiocManager::install("ComplexHeatmap", update=F)
BiocManager::install("monocle", update=F)
Check installations
library(Seurat)
library(Matrix)
library(dplyr)
library(reshape2)
library(fossil)
library(ComplexHeatmap)
library(monocle)
Open this link in a web browser: https://wd.cri.uic.edu/sc_rna/10X_example_report.html
Practice with inDrop data (full matrix) and 10X data (sparse matrix)
library(Seurat)
library(Matrix)
library(dplyr)
counts_indrop <- read.delim("https://wd.cri.uic.edu/sc_rna/inDrop_counts.txt",
row.names=1)
sparse_indrop <- Matrix(as.matrix(counts_indrop),sparse=T)
format(object.size(counts_indrop), units="auto")
[1] "46.8 Mb"
format(object.size(sparse_indrop), units="auto")
[1] "13.4 Mb"
seurat_indrop <- CreateSeuratObject(counts=sparse_indrop, project="indrop_sample_data")
seurat_indrop
An object of class Seurat
9435 features across 1275 samples within 1 assay
Active assay: RNA (9435 features, 0 variable features)
1 layer present: counts
Downloads folder, execute the following.sparse_10X <- Read10X("~/Downloads/10X_test/filtered_feature_bc_matrix/",
gene.column=1)
Downloads folder, execute the
following.sparse_10X <- Read10X("~/../Downloads/10X_test/filtered_feature_bc_matrix/",
gene.column=1)seurat_10X <- CreateSeuratObject(counts=sparse_10X, project="10X_sample_data")
seurat_10X
An object of class Seurat
31053 features across 1301 samples within 1 assay
Active assay: RNA (31053 features, 0 variable features)
1 layer present: counts
ABOUT: These are two samples from a recently published study, Martos SN, Campbell MR, Lozoya OA, Wang X et al. Single-cell analyses identify dysfunctional CD16+ CD8 T cells in smokers. Cell Rep Med 2020 Jul 21;1(4). PMID: 33163982.
Downloads folder, execute the following.sample1 <- Read10X("~/Downloads/V035_F031_subsample", gene.column=1)
sample2 <- Read10X("~/Downloads/V039_F093_subsample", gene.column=1)
Downloads folder, execute the
following.sample1 <- Read10X("~/../Downloads/V035_F031_subsample", gene.column=1)
sample2 <- Read10X("~/../Downloads/V039_F093_subsample", gene.column=1)seurat1 <- CreateSeuratObject(counts=sample1, project="sample1")
seurat1
An object of class Seurat
32738 features across 1629 samples within 1 assay
Active assay: RNA (32738 features, 0 variable features)
1 layer present: counts
seurat2 <- CreateSeuratObject(counts=sample2, project="sample2")
seurat2
An object of class Seurat
32738 features across 1334 samples within 1 assay
Active assay: RNA (32738 features, 0 variable features)
1 layer present: counts
add.cell.ids parameter specifies a prefix to add to the
cell IDs when combining the data.sc_data <- merge(seurat1, y=seurat2, add.cell.ids=c("s1","s2"))
sc_data
An object of class Seurat
32738 features across 2963 samples within 1 assay
Active assay: RNA (32738 features, 0 variable features)
2 layers present: counts.sample1, counts.sample2
# run JoinLayers to combine the tables for the different samples
# this is a new step for Seurat v5
sc_data <- JoinLayers(sc_data)
orig.ident is how we’re keeping
track of which file came from which samplehead(sc_data$orig.ident)
s1_AAACCTGAGCACCGTC-1 s1_AAACCTGAGCGTGAAC-1 s1_AAACCTGGTAGAGCTG-1
"sample1" "sample1" "sample1"
s1_AAACCTGGTCTAGCGC-1 s1_AAACCTGGTGTGGCTC-1 s1_AAACGGGAGTGTACGG-1
"sample1" "sample1" "sample1"
table()
functiontable(sc_data$orig.ident)
sample1 sample2
1629 1334
genes <- read.delim("https://wd.cri.uic.edu/sc_rna/hg19_genes.bed")
dim(genes)
[1] 43318 6
head(genes)
chrX X99883667 X99894988 ENSG00000000003 X0 X.
1 chrX 99839799 99854882 ENSG00000000005 0 +
2 chr20 49551404 49575092 ENSG00000000419 0 -
3 chr1 169818772 169863408 ENSG00000000457 0 -
4 chr1 169631245 169823221 ENSG00000000460 0 +
5 chr1 27938575 27961788 ENSG00000000938 0 -
6 chr1 196621008 196716634 ENSG00000000971 0 +
mt.genes <- as.character(genes[genes[,1]=="chrM", 4])
length(mt.genes)
[1] 13
sc_data[["percent.MT"]] <- PercentageFeatureSet(sc_data, features=mt.genes)
NOTE: There are a few ways to get a list of the mitochondrial gene IDs for your genome.
https://genome.ucsc.edu/cgi-bin/hgTables, just make sure to
get the one that matches the gene IDs in your file.VlnPlot(sc_data, features = c("nFeature_RNA","nCount_RNA","percent.MT"), pt.size=0.2)
Warning: Default search for "data" layer in "RNA" assay yielded no results;
utilizing "counts" layer instead.
5. Make scatterplot of number of UMI counts vs. number of genes
FeatureScatter(sc_data, feature1="nCount_RNA", feature2="nFeature_RNA")
FeatureScatter(sc_data, feature1="nCount_RNA", feature2="percent.MT")
7. Make scatterplot of number of genes vs. pecent mitochondrial
expression
FeatureScatter(sc_data, feature1="nFeature_RNA", feature2="percent.MT")
Based on the violin plots and scatterplots, decide on a reasonable threshold for filtering out the “bad” cells.
For today, our passing cells must have:
sc_subset <- subset(sc_data,
nFeature_RNA>500 & nCount_RNA>2000 & percent.MT < 10)
sc_subset
An object of class Seurat
32738 features across 2791 samples within 1 assay
Active assay: RNA (32738 features, 0 variable features)
1 layer present: counts
table()
functionorig.counts <- table(sc_data$orig.ident)
subset.counts <- table(sc_subset$orig.ident)
cell_stats <- cbind(orig.counts, subset.counts, subset.counts/orig.counts)
colnames(cell_stats) <- c("Starting Cells","Retained Cells","Fraction")
cell_stats
Starting Cells Retained Cells Fraction
sample1 1629 1523 0.9349294
sample2 1334 1268 0.9505247
sc_subset <- NormalizeData(sc_subset)
Normalizing layer: counts
sc_subset <- FindVariableFeatures(sc_subset, nfeatures=4000)
Finding variable features for layer counts
VariableFeaturePlot(sc_subset)
Warning in scale_x_log10(): log-10 transformation introduced infinite values.
ScaleData function will only scale the features that are
identified as variable via the FindVariableFeatures()
function.sc_subset <- ScaleData(sc_subset)
Centering and scaling data matrix
npcs = 50 is the default. You may want to
increase for more complex data sets. Also set “verbose=F”
otherwise Seurat will print out a number of statistics to the
console.sc_subset <- RunPCA(sc_subset, features=VariableFeatures(object=sc_subset),
npcs = 50, verbose=F)
ElbowPlot(sc_subset, ndims=24)
balanced=T). We want to examine each PC’s heatmap to see
at what point the “signal” starts to go away.NOTE: If you get warnings about figure margins too large, try making the plot pane in your R studio bigger
DimHeatmap(sc_subset, dims=1:24, cells=300, balanced=T)
DO NOT RUN THESE STEPS TODAY as they are computationally demanding (this exercise took ~9 minutes on my computer). We will review the commands and results in the exercise handout.
JackStraw is a method for computing p-values for principal components. As implemented in Seurat, the steps are:
JackStraw() computes projected PCA
scores for a random permutation of a subset of the data, and compares
these values to the observed PCA scores for the genes. This allows the
computation of a p-value for the association of each gene with each
PC.ScoreJackStraw() computes p-values for
the PCs based on the distribution of p-values obtained for all genes
against that PC from the JackStraw step.JackStrawPlot() makes a diagnostic
plot we can use to evaluate the signal level and significance of each
PC.NOTE: The selection of the number of PCs to use (dims parameter) is different for the different functions. For JackStraw() you just give a number (e.g., 24), and for the other two you give a range (e.g., 1:24).
sc_subset = JackStraw(sc_subset, num.replicate = 100, dims = 24)
sc_subset = ScoreJackStraw(sc_subset, dims = 1:24)
JackStrawPlot(sc_subset, dims = 1:24)
These are Q-Q plots: PC curves that are more diverged from the black dashed diagonal line have a higher level of signal. P-values are given in the legend.
FindNeighbors(). For this exercise we will
use the first 20 PCs to determine the distance between the cells and
thus who are the “neighbors”pca.dims = 1:20
sc_subset <- FindNeighbors(sc_subset, dims=pca.dims)
Computing nearest neighbor graph
Computing SNN
sc_subset <- FindClusters(sc_subset, resolution=0.5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2791
Number of edges: 110384
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8599
Number of communities: 7
Elapsed time: 0 seconds
sc_subset <- RunUMAP(sc_subset, dims=pca.dims)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
20:44:33 UMAP embedding parameters a = 0.9922 b = 1.112
20:44:33 Read 2791 rows and found 20 numeric columns
20:44:33 Using Annoy for neighbor search, n_neighbors = 30
20:44:33 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
20:44:33 Writing NN index file to temp file /tmp/RtmpD9QPiA/file28ac0550b621d
20:44:33 Searching Annoy index using 1 thread, search_k = 3000
20:44:33 Annoy recall = 100%
20:44:34 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
20:44:34 Initializing from normalized Laplacian + noise (using RSpectra)
20:44:34 Commencing optimization for 500 epochs, with 115030 positive edges
20:44:37 Optimization finished
DimPlot(sc_subset, reduction='umap', label=T)
DimPlot(sc_subset, reduction='umap', group.by="orig.ident")
We will try two strategies:
We have made a small list of a few CD markers and which cell type(s) they are associated with for common immune cell types. In practice, you would want to derive your own cell marker list based on the types of cells you anticipate finding in your data set, the level of specificity you’re looking for in defining cell types (e.g., generic T cell, vs T helper, T reg, T responder, etc.), and what markers you consider to be specific and informative for those cell types.
marker_list <- read.delim("https://wd.cri.uic.edu/sc_rna/human-blood-markers.txt")
marker_list
Marker Cell.Type
1 CD14 Monocyte
2 CD163 Monocyte
3 CD244 Monocyte
4 CD28 B cell
5 CD38 B cell
6 CD86 B cell
7 CD19 B cell, NK cell
8 CD2 NK cell
9 CD3D NK cell, T cell
10 CD3E NK cell, T cell
11 CD3G NK cell, T cell
12 CD8A T cell
genenames <- read.delim("~/Downloads/V035_F031_subsample/features.tsv.gz",
row.names=1)
make.unique() to deal with duplicate gene
symbolssymbols_to_ids <- rownames(genenames)
names(symbols_to_ids) <- make.unique(genenames[,1])
marker_list$ID <- symbols_to_ids[marker_list$Marker]
marker_list
Marker Cell.Type ID
1 CD14 Monocyte ENSG00000170458
2 CD163 Monocyte ENSG00000177575
3 CD244 Monocyte ENSG00000122223
4 CD28 B cell ENSG00000178562
5 CD38 B cell ENSG00000004468
6 CD86 B cell ENSG00000114013
7 CD19 B cell, NK cell ENSG00000177455
8 CD2 NK cell ENSG00000116824
9 CD3D NK cell, T cell ENSG00000167286
10 CD3E NK cell, T cell ENSG00000198851
11 CD3G NK cell, T cell ENSG00000160654
12 CD8A T cell ENSG00000153563
Use DotPlot from Seurat to look at expression levels per cluster.
DotPlot(sc_subset, features=marker_list$ID, group.by="RNA_snn_res.0.5" ) + RotatedAxis()
The DotPlot would be more useful if we could label the x-axis with gene symbols and cell type descriptions. Let’s do that.
rownames(marker_list) = marker_list$ID
marker_list$description = paste(marker_list$Cell.Type, marker_list$Marker, sep=": ")
marker_list
Marker Cell.Type ID description
ENSG00000170458 CD14 Monocyte ENSG00000170458 Monocyte: CD14
ENSG00000177575 CD163 Monocyte ENSG00000177575 Monocyte: CD163
ENSG00000122223 CD244 Monocyte ENSG00000122223 Monocyte: CD244
ENSG00000178562 CD28 B cell ENSG00000178562 B cell: CD28
ENSG00000004468 CD38 B cell ENSG00000004468 B cell: CD38
ENSG00000114013 CD86 B cell ENSG00000114013 B cell: CD86
ENSG00000177455 CD19 B cell, NK cell ENSG00000177455 B cell, NK cell: CD19
ENSG00000116824 CD2 NK cell ENSG00000116824 NK cell: CD2
ENSG00000167286 CD3D NK cell, T cell ENSG00000167286 NK cell, T cell: CD3D
ENSG00000198851 CD3E NK cell, T cell ENSG00000198851 NK cell, T cell: CD3E
ENSG00000160654 CD3G NK cell, T cell ENSG00000160654 NK cell, T cell: CD3G
ENSG00000153563 CD8A T cell ENSG00000153563 T cell: CD8A
labels() parameter
in scale_x_discrete lets us change x-tic labels. We’re making a function
that will return the description column of marker_list, given the
Ensembl IDlibrary(ggplot2)
DotPlot(sc_subset, features=marker_list$ID, group.by="RNA_snn_res.0.5" ) +
RotatedAxis() +
scale_x_discrete(labels=function(x) marker_list[x,"description"])
We may also find it useful to look at the expression of these markers on the UMAP plot. Here’s how we can use Seurat’s FeaturePlot function, but add in our own custom labels:
library(cowplot)
# run FeaturePlot with combine=F so we get all separate plots
# 'myplots' will be a list of ggplot objects
myplots <- FeaturePlot(sc_subset, rownames(marker_list), combine=F)
# for each plot in the list, add a new title with the description
for(i in 1:length(myplots)){
myplots[[i]] = myplots[[i]] + ggtitle(marker_list[i,"description"])
}
# then make a new combined plot with plot_grid
plot_grid(plotlist=myplots)
At this point, you will need to examine the expression of markers across clusters manually decide on cell types.
We need an external single-cell data set whose cell types we trust. This is typically from published data sets, and we need to find a source where the authors have published not only the gene expression matrix files (as required), but also the cell type metadata (not always included).
Then you would load these files into a Seurat object, run normalization as above, and add the cell types to the single-cell metadata.
For this exercise, we are providing data from:
Single-cell RNA-Seq analysis reveals cell subsets and gene signatures associated with rheumatoid arthritis disease activity, Binvignat et al., JCI Insight, 2024 Aug 22; 9(16): e178499.
# read in Seurat object with data
sc_ref <- readRDS(url("https://wd.cri.uic.edu/sc_rna/sc_reference.rds"))
# run normalization and find variable features
sc_ref <- NormalizeData(sc_ref)
sc_ref <- FindVariableFeatures(sc_ref, nfeatures=4000)
# confirm that the gene identifiers match, and subset to genes in common
head(rownames(sc_ref))
[1] "ENSG00000186092" "ENSG00000238009" "ENSG00000239945" "ENSG00000241599"
[5] "ENSG00000229905" "ENSG00000237491"
head(rownames(sc_subset))
[1] "ENSG00000243485" "ENSG00000237613" "ENSG00000186092" "ENSG00000238009"
[5] "ENSG00000239945" "ENSG00000237683"
common.genes <- intersect(rownames(sc_ref),rownames(sc_subset))
length(common.genes)
[1] 20667
# cell types we can transfer
table(sc_ref@meta.data$rough_annot)
Bcells CD4Tcells CD8Tcells Monocytes NKcells
5776 21333 3142 9497 3846
table(sc_ref@meta.data$fine_annot)
CD4 T central memory CD4 T effector memory CD4 T IFIT
8739 5049 239
CD4 T Naive yd T cells CD8 T early Tem
6724 582 915
CD8 T Naive CD8 TEMRA Classical Monocytes
1164 1063 3653
IFITM3 Monocytes IL1b-Monocytes Myeloid DCs
246 2280 2429
Non-classical Monocytes NKCD56bright NKCD56low
889 3039 807
Memory Bcells Naive Bcells Plasmablasts
2462 2739 575
We will use the rough_annot for now, but you can repeat
the same process with fine_annot if you want.
# run the label transfer (transfer anchors) with Seurat
# NOTE: this took ~1.5 minutes on my computer
anchors <- FindTransferAnchors(reference=sc_ref, query=sc_subset, dims=1:30)
Performing PCA on the provided reference using 3776 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 3524 anchors
predictions <- TransferData( anchorset = anchors,
refdata = sc_ref@meta.data$rough_annot )
Finding integration vectors
Finding integration vector weights
Predicting cell labels
head(predictions)
predicted.id prediction.score.Monocytes
s1_AAACCTGAGCACCGTC-1 Bcells 0
s1_AAACCTGGTAGAGCTG-1 CD8Tcells 0
s1_AAACCTGGTCTAGCGC-1 CD4Tcells 0
s1_AAACCTGGTGTGGCTC-1 CD8Tcells 0
s1_AAACGGGGTGGTACAG-1 CD8Tcells 0
s1_AAACGGGTCACCCGAG-1 CD8Tcells 0
prediction.score.CD4Tcells prediction.score.CD8Tcells
s1_AAACCTGAGCACCGTC-1 0.0000000 0.00000000
s1_AAACCTGGTAGAGCTG-1 0.3384641 0.58744207
s1_AAACCTGGTCTAGCGC-1 0.9385949 0.03228869
s1_AAACCTGGTGTGGCTC-1 0.1312291 0.80917079
s1_AAACGGGGTGGTACAG-1 0.2367898 0.71659196
s1_AAACGGGTCACCCGAG-1 0.1889408 0.80210806
prediction.score.Bcells prediction.score.NKcells
s1_AAACCTGAGCACCGTC-1 1 0.000000000
s1_AAACCTGGTAGAGCTG-1 0 0.074093840
s1_AAACCTGGTCTAGCGC-1 0 0.029116392
s1_AAACCTGGTGTGGCTC-1 0 0.059600097
s1_AAACGGGGTGGTACAG-1 0 0.046618223
s1_AAACGGGTCACCCGAG-1 0 0.008951116
prediction.score.max
s1_AAACCTGAGCACCGTC-1 1.0000000
s1_AAACCTGGTAGAGCTG-1 0.5874421
s1_AAACCTGGTCTAGCGC-1 0.9385949
s1_AAACCTGGTGTGGCTC-1 0.8091708
s1_AAACGGGGTGGTACAG-1 0.7165920
s1_AAACGGGTCACCCGAG-1 0.8021081
# set cells with low-confidence predictions to NA
cell.types <- predictions$predicted.id
cell.types[predictions$prediction.score.max < 0.9] <- NA
table(cell.types)
cell.types
Bcells CD4Tcells CD8Tcells Monocytes NKcells
218 1325 146 243 73
# add these cell types to original data and plot in UMAP
# also make plots with some of the marker genes
sc_subset@meta.data$CellType <- cell.types
DimPlot(sc_subset, group.by = "CellType")
# compare to the clustering UMAP plot and the dotplot
DimPlot(sc_subset, reduction='umap', label=T)
DotPlot(sc_subset, features=marker_list$ID, group.by="RNA_snn_res.0.5" ) +
RotatedAxis() +
scale_x_discrete(labels=function(x) marker_list[x,"description"])
# summary statistics of cell types per cluster
table(sc_subset@meta.data[,c("RNA_snn_res.0.5","CellType")])
CellType
RNA_snn_res.0.5 Bcells CD4Tcells CD8Tcells Monocytes NKcells
0 0 340 0 0 0
1 0 554 0 0 0
2 0 381 0 0 0
3 0 49 146 0 1
4 0 0 0 243 0
5 218 0 0 0 0
6 0 1 0 0 72
saveRDS(sc_subset, file="sc_subset.rds")
If you closed R, then reload the sc_subset Seurat
object. Otherwise skip this step.
library(Seurat)
sc_subset <- readRDS("sc_subset.rds")
If you closed R and forgot to save your Seurat object, you can
read it from our server. Otherwise skip this step. NOTE
Use the url() function when reading an RDS from the
web.
library(Seurat)
sc_subset <- readRDS(url("https://wd.cri.uic.edu/sc_rna/sc_subset.rds"))
First we will run differential expression between clusters using the Wilcox test. Only test genes that are expressed in at least 20% of cells (default is 1%).
degs <- FindAllMarkers(sc_subset, test.use="wilcox", min.pct=0.2)
Calculating cluster 0
Calculating cluster 1
Calculating cluster 2
Calculating cluster 3
Calculating cluster 4
Calculating cluster 5
Calculating cluster 6
head(degs)
p_val avg_log2FC pct.1 pct.2 p_val_adj cluster
ENSG00000160789 1.546991e-115 2.7297095 0.527 0.119 5.064540e-111 0
ENSG00000150093 2.372247e-79 1.8822769 0.532 0.166 7.766262e-75 0
ENSG00000008517 2.526313e-73 0.9175969 0.958 0.703 8.270645e-69 0
ENSG00000227507 5.156198e-72 0.8962100 0.983 0.817 1.688036e-67 0
ENSG00000168685 4.845478e-56 0.9567066 0.860 0.551 1.586312e-51 0
ENSG00000204389 7.040733e-51 0.9275783 0.886 0.645 2.304995e-46 0
gene
ENSG00000160789 ENSG00000160789
ENSG00000150093 ENSG00000150093
ENSG00000008517 ENSG00000008517
ENSG00000227507 ENSG00000227507
ENSG00000168685 ENSG00000168685
ENSG00000204389 ENSG00000204389
Add in gene symbols, which we can find in any of the features.tsv.gz files from CellRanger.
genenames <- read.delim("~/Downloads/V035_F031_subsample/features.tsv.gz", row.names=1)
degs$symbol = genenames[degs$gene, 1]
head(degs)
p_val avg_log2FC pct.1 pct.2 p_val_adj cluster
ENSG00000160789 1.546991e-115 2.7297095 0.527 0.119 5.064540e-111 0
ENSG00000150093 2.372247e-79 1.8822769 0.532 0.166 7.766262e-75 0
ENSG00000008517 2.526313e-73 0.9175969 0.958 0.703 8.270645e-69 0
ENSG00000227507 5.156198e-72 0.8962100 0.983 0.817 1.688036e-67 0
ENSG00000168685 4.845478e-56 0.9567066 0.860 0.551 1.586312e-51 0
ENSG00000204389 7.040733e-51 0.9275783 0.886 0.645 2.304995e-46 0
gene symbol
ENSG00000160789 ENSG00000160789 LMNA
ENSG00000150093 ENSG00000150093 ITGB1
ENSG00000008517 ENSG00000008517 IL32
ENSG00000227507 ENSG00000227507 LTB
ENSG00000168685 ENSG00000168685 IL7R
ENSG00000204389 ENSG00000204389 HSPA1A
Count the number of genes with p_val_adj < 0.05 and
at least a two-fold change for each cluster.
table(degs[degs$p_val_adj<0.05 &
abs(degs$avg_log2FC)>1, "cluster"])
0 1 2 3 4 5 6
48 88 137 64 966 358 337
Get top 5 up-regulated genes per cluster. The group_by()
function will group the data.frame rows by the cluster
column and the top_n() function will return the top n (10)
rows ranked by the column myAUC.
NOTE The %>% is a “pipe” function
that pass the output from the each function to the next function.
library(dplyr)
top5 <- degs %>% group_by(cluster) %>% top_n(n=5, wt=avg_log2FC)
top5
# A tibble: 35 × 8
# Groups: cluster [7]
p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene symbol
<dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr> <chr>
1 1.55e-115 2.73 0.527 0.119 5.06e-111 0 ENSG00000160789 LMNA
2 2.37e- 79 1.88 0.532 0.166 7.77e- 75 0 ENSG00000150093 ITGB1
3 1.48e- 50 1.57 0.446 0.164 4.83e- 46 0 ENSG00000165272 AQP3
4 9.55e- 36 1.40 0.363 0.142 3.13e- 31 0 ENSG00000169756 LIMS1
5 1.27e- 17 1.40 0.303 0.153 4.17e- 13 0 ENSG00000090104 RGS1
6 7.87e- 81 2.14 0.489 0.146 2.58e- 76 1 ENSG00000189283 FHIT
7 2.01e- 48 1.35 0.569 0.282 6.59e- 44 1 ENSG00000126353 CCR7
8 1.79e- 42 2.32 0.24 0.058 5.86e- 38 1 ENSG00000111863 ADTRP
9 4.96e- 38 1.86 0.316 0.109 1.62e- 33 1 ENSG00000182463 TSHZ2
10 6.67e- 18 1.18 0.265 0.124 2.18e- 13 1 ENSG00000072110 ACTN1
# ℹ 25 more rows
These visualizations use the built-in functions from Seurat.
top5_cluster5 = top5[top5$cluster==5,]
VlnPlot(sc_subset, features=top5_cluster5$gene, pt.size=F)
VlnPlot(sc_subset, features=top5_cluster5$gene, split.by = "orig.ident", pt.size=F)
The default behaviour of split.by has changed.
Separate violin plots are now plotted side-by-side.
To restore the old behaviour of a single split violin,
set split.plot = TRUE.
This message will be shown once per session.
DotPlot(sc_subset,features=top5_cluster5$gene) + RotatedAxis()
FeaturePlot(sc_subset, features=top5_cluster5$gene)
NOTE! By default, Seurat will only scale the
variable features identified by FindVariableFeatures().
DoHeatmap(sc_subset, features=top5$gene)
Make nicer versions of these plots, with a little extra work.
Violin plots:
myplots <- VlnPlot(sc_subset, features=top5_cluster5$gene, combine=F)
for(i in 1:length(myplots)){
myplots[[i]] = myplots[[i]] + ggtitle(top5_cluster5$symbol[i])
}
plot_grid(plotlist=myplots)
Dotplots:
DotPlot(sc_subset, features=top5_cluster5$gene ) +
scale_x_discrete(labels=function(x) genenames[x,1])
Heatmap:
Option 1, using Seurat’s DoHeatmap (this will still omit genes that were not z-scaled - not part of the variable features).
DoHeatmap(sc_subset, features=top5$gene) +
scale_y_discrete(labels=function(x) genenames[x,1])
Option 2, using ComplexHeatmap
# get normalized expression data of interest
norm.data <- GetAssayData(sc_subset,layer="data")
norm.data <- norm.data[top5$gene,]
# z-score
norm.data <- t(scale(t(norm.data)))
# order the columns by cluster
columns.clusters <- sc_subset@meta.data$seurat_clusters
columns.order <- order(columns.clusters)
norm.data <- norm.data[,columns.order]
columns.clusters <- columns.clusters[columns.order]
# make heatmap
library(ComplexHeatmap)
library(circlize)
col_fun <- colorRamp2(breaks=c(-2,0,2),colors=c("blue","white","red"))
column_annotation <- HeatmapAnnotation(cluster=columns.clusters)
Heatmap(norm.data, name="Z-score", col=col_fun,
cluster_columns=F, show_column_names=F, column_split = columns.clusters,
cluster_rows=F, row_labels=top5$symbol)
FindAllMarkers) to test specific subsets of clusters.orig.ident.cluster1vs2 <- FindMarkers(sc_subset, ident.1 = 1, ident.2 = 2, group.by="seurat_clusters")
cluster1vs2and3 <- FindMarkers(sc_subset, ident.1 = 1, ident.2 = c(2,3), group.by="seurat_clusters")
Idents(sc_subset) = "CellType"
stats_between_types <- FindAllMarkers(sc_subset, test.use="wilcox")
Calculating cluster Bcells
Calculating cluster CD4Tcells
Calculating cluster Monocytes
Calculating cluster CD8Tcells
Calculating cluster NKcells
We have two options here:
Differential stats on a single-cell level
logfc.threshold = 0 to test ALL genes with
sufficient minimum expression. This avoids biasing the p-values by only
testing genes with a minimum difference.# example analyzing for cluster 1
sc_cluster1 = subset(sc_subset, subset = seurat_clusters == 1)
Idents(sc_cluster1) = "orig.ident"
cluster1_stats <- FindAllMarkers(sc_cluster1, test.use="wilcox",
logfc.threshold = 0)
cluster1_stats$p_val_adj = p.adjust(cluster1_stats$p_val, method="fdr")
Differential stats as pseudo-bulk
sc_cluster1 = subset(sc_subset, subset = seurat_clusters == 1)
samples = unique(sc_cluster1$orig.ident)
counts = GetAssayData(sc_subset,layer="counts")
cluster1_counts = sapply(samples, function(x){
sample.cells = sc_cluster1$orig.ident == x
sample.counts = Matrix::rowSums(counts[,sample.cells])
return(sample.counts)
})
head(cluster1_counts)
sample1 sample2
ENSG00000243485 0 0
ENSG00000237613 0 0
ENSG00000186092 0 0
ENSG00000238009 5 0
ENSG00000239945 0 0
ENSG00000237683 8 0
To run edgeR we would really need to have biological replicates, but this would allow us to correctly factor in biological variability when performing our statistical analysis.
NOTE: You may wish to perform both of the above analyses over ALL clusters. In this case, you can write a loop to perform the same steps over each cluster.
table(sc_subset$seurat_clusters)
0 1 2 3 4 5 6
641 601 549 429 243 233 95
seurat_clusters is the cluster ID and
orig.ident is the sample ID for each cell). The
dnn parameter will allow us to provide a nicer name for the
output from table().cluster_abundance <- table(sc_subset$seurat_clusters,
sc_subset$orig.ident,
dnn=c("cluster", "group"))
cluster_abundance
group
cluster sample1 sample2
0 410 231
1 536 65
2 39 510
3 286 143
4 49 194
5 169 64
6 34 61
cluster_percent <- t(t(cluster_abundance)/colSums(cluster_abundance))
cluster_percent
group
cluster sample1 sample2
0 0.26920552 0.18217666
1 0.35193697 0.05126183
2 0.02560735 0.40220820
3 0.18778726 0.11277603
4 0.03217334 0.15299685
5 0.11096520 0.05047319
6 0.02232436 0.04810726
Take a look at some of the parts of the Seurat package
Browse the Seurat R object through R Studio
head(sc_subset@meta.data$orig.ident)
[1] "sample1" "sample1" "sample1" "sample1" "sample1" "sample1"
head(sc_subset@meta.data$seurat_clusters)
[1] 5 0 1 3 0 3
Levels: 0 1 2 3 4 5 6
head(sc_subset@meta.data$RNA_snn_res.0.5)
[1] 5 0 1 3 0 3
Levels: 0 1 2 3 4 5 6
GetAssayData(sc_subset,layer="scale.data")[1:5,1:5]
s1_AAACCTGAGCACCGTC-1 s1_AAACCTGGTAGAGCTG-1
ENSG00000228327 -0.09095396 -0.09095396
ENSG00000188290 -0.14042283 -0.14042283
ENSG00000187608 -0.57919340 -0.57919340
ENSG00000237330 -0.01892867 -0.01892867
ENSG00000272141 -0.01892867 -0.01892867
s1_AAACCTGGTCTAGCGC-1 s1_AAACCTGGTGTGGCTC-1
ENSG00000228327 -0.09095396 -0.09095396
ENSG00000188290 -0.14042283 -0.14042283
ENSG00000187608 0.80624558 -0.57919340
ENSG00000237330 -0.01892867 -0.01892867
ENSG00000272141 -0.01892867 -0.01892867
s1_AAACGGGGTGGTACAG-1
ENSG00000228327 -0.09095396
ENSG00000188290 -0.14042283
ENSG00000187608 2.85856629
ENSG00000237330 -0.01892867
ENSG00000272141 -0.01892867
head(sc_subset@reductions$umap@cell.embeddings)
umap_1 umap_2
s1_AAACCTGAGCACCGTC-1 15.771058 -0.2154330
s1_AAACCTGGTAGAGCTG-1 -2.339165 0.8274526
s1_AAACCTGGTCTAGCGC-1 -2.656620 -1.8505072
s1_AAACCTGGTGTGGCTC-1 -4.207153 1.3721313
s1_AAACGGGGTGGTACAG-1 -1.847196 3.2677318
s1_AAACGGGTCACCCGAG-1 -2.832709 3.8371325
plot(sc_subset@reductions$umap@cell.embeddings)
We can use the same object, as each resolution will get saved into its own slot.
sc_subset <- FindClusters(sc_subset, resolution=0.1)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2791
Number of edges: 110384
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9439
Number of communities: 4
Elapsed time: 0 seconds
sc_subset <- FindClusters(sc_subset, resolution=1)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2791
Number of edges: 110384
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8027
Number of communities: 12
Elapsed time: 0 seconds
table(sc_subset@meta.data$RNA_snn_res.0.1)
0 1 2 3
1165 1150 243 233
table(sc_subset@meta.data$RNA_snn_res.0.5)
0 1 2 3 4 5 6
641 601 549 429 243 233 95
table(sc_subset@meta.data$RNA_snn_res.1)
0 1 2 3 4 5 6 7 8 9 10 11
413 380 294 283 251 243 229 227 143 124 109 95
fossil librarylibrary(fossil)
clust0.1 <- sc_subset@meta.data$RNA_snn_res.0.1
clust0.5 <- sc_subset@meta.data$RNA_snn_res.0.5
clust1 <- sc_subset@meta.data$RNA_snn_res.1
adj.rand.index(clust0.1, clust0.5)
[1] 0.6428863
adj.rand.index(clust0.1, clust1)
[1] 0.4831814
adj.rand.index(clust0.5, clust1)
[1] 0.7179299
Exercise for home, skip for today (jump to next page to continue exercise)
NOTE: If you ended up with a large number of clustering results and you wanted to compute pair-wise similarities across all of them using the adjusted rand index, you would approach this by setting up a data frame with all of the clustering results in it across the columns, then running a loop over all pairs of columns.:
# building a data frame for our 3 clustering results
cluster.df <- data.frame(clust0.1, clust0.5, clust1)
# define a function to make a similarity matrix
cluster_similarity <- function( clust_df ){
# number of columns we're comparing
columns <- ncol(clust_df)
# set up a similarity matrix
rand.sim <- matrix( nrow=columns, ncol=columns )
rownames(rand.sim) = colnames(clust_df)
colnames(rand.sim) = colnames(clust_df)
# set all values to be 1 first
# this will keep the diagonal entries 1 after we do the other calculations
rand.sim[,] <- 1
# loop over all pairs of columns
for( i in 1:(columns-1) ){
for( j in (i+1):columns ){
# compute the similarity and store it at positions i,j and j,i
sim <- adj.rand.index( clust_df[,i], clust_df[,j] )
rand.sim[i,j] <- sim
rand.sim[j,i] <- sim
}
}
return(rand.sim)
}
# run the function
cluster.sim <- cluster_similarity(cluster.df)
cluster.sim
The goal is to compare the actual sets of cells in each clustering result, so that we can see which clusters between the results are related to each other. As a first pass, we can evaluate this qualitatively with two UMAP plots.
DimPlot(sc_subset, reduction='umap', group.by="RNA_snn_res.0.1", label=T)
DimPlot(sc_subset, reduction='umap', group.by="RNA_snn_res.1", label=T)
Quantitative comparison:
#), but
they are there to explain our steps along the way.Source this function to save time:
source("https://wd.cri.uic.edu/sc_rna/overlap_index.R")
Here is its definition, if you wanted to write it out:
# function for overlap index
overlap_index <- function( cluster1, cluster2 ){
# make factor vectors from clusters
c1 = as.factor(cluster1)
c2 = as.factor(cluster2)
# define a set of "names" for our cells, which we'll compare between clusters
# the names are arbitraty, so we'll just number them
names = 1:length(c1)
# make distance matrix to store our calculations in
my_dist = matrix(nrow=length(levels(c1)),ncol=length(levels(c2)))
rownames(my_dist) = levels(c1)
colnames(my_dist) = levels(c2)
for(i in 1:length(levels(c1))){
for(j in 1:length(levels(c2))){
# get the list of cells in each cluster
c1.sub = names[c1==levels(c1)[i]]
c2.sub = names[c2==levels(c2)[j]]
# get the intersection and min cluster size
int = length(intersect(c1.sub,c2.sub))
minsize = min(length(c1.sub),length(c2.sub))
# overlap index
overlap = int/minsize
# store in matrix
my_dist[i,j] = overlap
}
}
return(my_dist)
}
Now run the function
res_0.1vs1 <- overlap_index(clust0.1, clust1)
res_0.1vs1
0 1 2 3 4 5 6 7 8 9 10 11
0 0.98789346 0.005263158 1 0 0.007968127 0 0.004366812 0.969163 1 0 0 1
1 0.01210654 0.994736842 0 1 0.992031873 0 0.995633188 0.030837 0 0 0 0
2 0.00000000 0.000000000 0 0 0.000000000 1 0.000000000 0.000000 0 0 0 0
3 0.00000000 0.000000000 0 0 0.000000000 0 0.000000000 0.000000 0 1 1 0
library(ComplexHeatmap)
Heatmap(res_0.1vs1,
col = c("white","yellow","red"),
name = "Overlap Index",
column_title = "Resolution 1",
row_title = "Resolution 0.1")
NOTE: DO NOT PERFORM THE FOLLOWING COMMANDS
This is an example of the R commands to load 10X data with both gene expression (GEX) and feature barcoding (FBC) data into Seurat. For this workshop we will be providing a loaded, filtered, and clustered R data object (.rds file). So, skip ahead to Basic visualization of FBC data for the actual exercise.
Read10X
function.data_w_fbc <- Read10X("Sample_A_matrix/", gene.column=1)
NOTE: When the data are loaded by Read10X, it will create a
R list with elements for both the gene expression (GEX)
data and the feature barcoding (FBC) data. You can use the following
command to view the elements in this list. This is not necessary if you
already know the type of FBC data included.
names(data_w_fbc)
sc_w_fbc <- CreateSeuratObject(counts = data_w_fbc[['Gene Expression']])
sc_w_fbc[["Protein"]] <- CreateAssayObject(counts = data_w_fbc[["Antibody Capture"]])
For this exercise we will be providing a loaded, filtered, and clustered R data object. The original data are an example dataset from 10X (https://www.10xgenomics.com/resources/datasets/human-pbmc-from-a-healthy-donor-1-k-cells-v-2-2-standard-4-0-0).
For this Seurat object we had performed the following.
nFeature_RNA > 1000 & nCount_RNA > 3000 & percent.MT < 10.NormalizeData().FindVariableFeatures().RunPCA().FindNeighbors() using the first 10 PCA
dimensions.FindClusters() with a resolution
of 1.sc_w_fbc <- readRDS(url("https://wd.cri.uic.edu/sc_rna/sc_fbc.rds"))
DimPlot(sc_w_fbc, reduction='umap', label=T)
3.
List the features included in the FBC (Protein) assay.
row.names(sc_w_fbc[["Protein"]])
[1] "CD3" "CD19" "CD45RA" "CD4" "CD8a"
[6] "CD14" "CD16" "CD56" "CD25" "CD45RO"
[11] "PD-1" "TIGIT" "IgG1" "IgG2a" "IgG2b"
[16] "CD127" "CD15" "CD197-(CCR7)" "HLA-DR"
. Generate violin plots including FBC data and grouped by cluster.
VlnPlot(sc_w_fbc,
features = c("nFeature_RNA","nCount_RNA", "nCount_Protein"),
pt.size=0.2)
5.
Normalize the FBC data using a center log ratio (CLR) normalization.
sc_w_fbc <- NormalizeData(sc_w_fbc, assay="Protein", normalization.method="CLR")
Normalizing across features
DefaultAssay(sc_w_fbc)
[1] "RNA"
DefaultAssay(sc_w_fbc) <- "Protein"
DefaultAssay(sc_w_fbc)
[1] "Protein"
row.names(sc_w_fbc[["Protein"]])
[1] "CD3" "CD19" "CD45RA" "CD4" "CD8a"
[6] "CD14" "CD16" "CD56" "CD25" "CD45RO"
[11] "PD-1" "TIGIT" "IgG1" "IgG2a" "IgG2b"
[16] "CD127" "CD15" "CD197-(CCR7)" "HLA-DR"
sel_ab <- c("CD3", "CD4", "CD8a", "IgG1")
FeaturePlot(sc_w_fbc, features=sel_ab, label=T)
8. Create dot plot of FBC features
DotPlot(sc_w_fbc, features=sel_ab)
VlnPlot(sc_w_fbc, features=sel_ab)
sc_w_fbc <- ScaleData(sc_w_fbc, assay="Protein")
Centering and scaling data matrix
DoHeatmap(sc_w_fbc, features=sel_ab)
DoHeatmap(sc_w_fbc, features=row.names(sc_w_fbc[["Protein"]]))
sel_features <- c("protein_CD4", "rna_ENSG00000010610",
"protein_CD8a", "rna_ENSG00000153563")
FeaturePlot(sc_w_fbc, features = sel_features, label=T)
3. Generate dot plots.
DotPlot(sc_w_fbc, features = sel_features) + RotatedAxis()
4. Generate violin plots.
VlnPlot(sc_w_fbc, features = sel_features, ncol=2)
fbc_presence <- t(sc_w_fbc[["Protein"]]@data > 1)
cluster_ids <- data.frame(Cluster=sc_w_fbc$seurat_clusters)
fbc_presence <- merge(cluster_ids, fbc_presence, by.x=0, by.y=0)
cd4_counts <- table(fbc_presence[,c("Cluster", "CD4")])
cd4_counts
CD4
Cluster FALSE TRUE
0 10 213
1 55 137
2 6 87
3 91 0
4 80 4
5 37 1
6 31 6
7 31 0
8 7 9
cd4_counts_df <- as.data.frame(cd4_counts)
cd4_counts <- merge(subset(cd4_counts_df, CD4 == "TRUE", select=c(Cluster, Freq)),
subset(cd4_counts_df, CD4 == "FALSE", select=c(Cluster, Freq)),
by.x=1, by.y=1)
colnames(cd4_counts)[2:3] <- c("Present", "Absent")
cd4_counts$Fraction <- cd4_counts$Present / ( cd4_counts$Present + cd4_counts$Absent )
cd4_counts
Cluster Present Absent Fraction
1 0 213 10 0.95515695
2 1 137 55 0.71354167
3 2 87 6 0.93548387
4 3 0 91 0.00000000
5 4 4 80 0.04761905
6 5 1 37 0.02631579
7 6 6 31 0.16216216
8 7 0 31 0.00000000
9 8 9 7 0.56250000
Information about the format of the
filtered_contig_annotations.csv file can be found at https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/annotation#contig.
For this exercise, the following contig annotations were provided.
| Column | Description |
|---|---|
| barcode | Cell-barcode for this contig. |
| is_cell | True or False value indicating whether the barcode was called as a cell. |
| contig_id | Unique identifier for this contig. |
| high_confidence | True or False value indicating whether the contig was called as high-confidence (unlikely to be a chimeric sequence or some other artifact). |
| length | The contig sequence length in nucleotides. |
| chain | The chain associated with this contig; for example, TRA, TRB, IGK, IGL, or IGH. A value of “Multi” indicates that segments from multiple chains were present. |
| v_gene | The highest-scoring V segment, for example, TRAV1-1. |
| d_gene | The highest-scoring D segment, for example, TRBD1. |
| j_gene | The highest-scoring J segment, for example, TRAJ1-1. |
| c_gene | The highest-scoring C segment, for example, TRAC. |
| full_length | If the contig was declared as full-length. |
| productive | If the contig was declared as productive. |
| cdr3 | The predicted CDR3 amino acid sequence. |
| cdr3_nt | The predicted CDR3 nucleotide sequence. |
| reads | The number of reads aligned to this contig. |
| umis | The number of distinct UMIs aligned to this contig. |
| raw_clonotype_id | The ID of the clonotype to which this cell barcode was assigned. |
| raw_consensus_id | The ID of the consensus sequence to which this contig was assigned. |
We will start with a very basic analysis: check which cells have TRA or TRB chains. Depending on the goals of your project, you will also want to dig into the specific CDR3 sequences to compare between cells or between clusters.
Load the contig annotations into R.
sc_vdj_contigs <- read.csv("https://wd.cri.uic.edu/sc_rna/tcr_contigs.csv")
Clean up the barcode IDs, so they match the IDs in the Seurat object (missing the trailing numbers).
sc_vdj_contigs$barcode <- sub('-[0-9]+$', '', sc_vdj_contigs$barcode)
Check how many cells have TRA or TRB or both.
# start with all of the cells
all_cells <- sc_w_fbc@meta.data["seurat_clusters"]
all_cells$TRA <- rownames(all_cells) %in% sc_vdj_contigs$barcode[sc_vdj_contigs$chain=="TRA"]
all_cells$TRB <- rownames(all_cells) %in% sc_vdj_contigs$barcode[sc_vdj_contigs$chain=="TRB"]
all_cells$TRAB <- all_cells$TRA & all_cells$TRB
head(all_cells)
seurat_clusters TRA TRB TRAB
AAACCTGCAGCCTGTG 6 FALSE FALSE FALSE
AAACGGGTCGCTGATA 2 TRUE TRUE TRUE
AAAGCAAAGTATGACA 2 TRUE TRUE TRUE
AAATGCCCACTTAAGC 0 TRUE TRUE TRUE
AACACGTCAACAACCT 0 TRUE TRUE TRUE
AACACGTCAAGCGATG 1 FALSE FALSE FALSE
Summary stats of complete A and B chains per cluster.
table(all_cells[,c("seurat_clusters","TRAB")])
TRAB
seurat_clusters FALSE TRUE
0 19 204
1 191 1
2 5 88
3 9 82
4 84 0
5 4 34
6 12 25
7 31 0
8 16 0
NOTE: This exercise will extract the UMAP plot data from the Seurat object and create a custom plot. It is also possible to add the TCR data to the Seurat object as a secondary assay, akin to the FBC data, and use the Seurat tools.
Get the UMAP coordinates as a data.frame.
umap_coords <- as.data.frame(Embeddings(sc_w_fbc, reduction="umap"))
# Take a peek at the results
head(umap_coords)
umap_1 umap_2
AAACCTGCAGCCTGTG -7.553978 2.770488
AAACGGGTCGCTGATA -5.912212 -1.226433
AAAGCAAAGTATGACA -6.879341 -0.258912
AAATGCCCACTTAAGC -6.659776 -5.674999
AACACGTCAACAACCT -4.886473 -5.689088
AACACGTCAAGCGATG 9.418021 -2.538629
Merge the UMAP coordinates and chain information.
plot_data <- merge(all_cells, umap_coords, by=0)
# Take a peek at the results
head(plot_data)
Row.names seurat_clusters TRA TRB TRAB umap_1 umap_2
1 AAACCTGCAGCCTGTG 6 FALSE FALSE FALSE -7.553978 2.770488
2 AAACGGGTCGCTGATA 2 TRUE TRUE TRUE -5.912212 -1.226433
3 AAAGCAAAGTATGACA 2 TRUE TRUE TRUE -6.879341 -0.258912
4 AAATGCCCACTTAAGC 0 TRUE TRUE TRUE -6.659776 -5.674999
5 AACACGTCAACAACCT 0 TRUE TRUE TRUE -4.886473 -5.689088
6 AACACGTCAAGCGATG 1 FALSE FALSE FALSE 9.418021 -2.538629
Create the basic plot, UMAP plot colored by chain information. The
addition of
guides(color=guide_legend(override.aes = list(size=3)))
will set the size of the plot character in the legend to be a different
size than the plot characters used in the plot.
library(ggplot2)
ggplot(plot_data, aes(x=umap_1, y=umap_2, color=TRAB)) +
geom_point(size=0.5) + theme_classic() +
guides(color=guide_legend(override.aes = list(size=3)))
ggplot(plot_data, aes(x=umap_1, y=umap_2, color=seurat_clusters)) +
geom_point(size=0.5) + theme_classic() +
guides(color=guide_legend(override.aes = list(size=3)))
NOTE: Additional exercise of incorporating V(D)J data into Seurat is found in Extra (Take Home) Exercises.
Seurat (needed to manipulate the previously
create Seurat object) and monocle for the pseudotime
analysis# load both libraries, if not done already
library(Seurat)
library(monocle)
sc_subset <- readRDS("sc_subset.rds")
Monocle has a function called “importCDS” that is supposed to import from Seurat. But it doesn’t work. Fun. We have created a fixed version of the function, which you can source from public-data.
Pro tip: If you want to see the code of a function, just type its name and hit enter. E.g., type “importCDS” to see the code for this function in Monocle2. After you’ve sourced our R script, you can type “importCDS2” to see our version.
# source the R script
source("https://wd.cri.uic.edu/sc_rna/importCDS2.R")
# use the importCDS2 function (our function) instead of importCDS (monocle function)
monocle_data <- importCDS2(sc_subset)
monocle_data
CellDataSet (storageMode: environment)
assayData: 32738 features, 2791 samples
element names: exprs
protocolData: none
phenoData
sampleNames: s1_AAACCTGAGCACCGTC-1 s1_AAACCTGGTAGAGCTG-1 ...
s2_TTTGTCATCCTCAATT-1 (2791 total)
varLabels: orig.ident nCount_RNA ... Size_Factor (10 total)
varMetadata: labelDescription
featureData
featureNames: ENSG00000243485 ENSG00000237613 ... ENSG00000215611
(32738 total)
fvarLabels: gene_short_name
fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:
We need to tell Monocle which genes to base the analysis on. Monocle has a set of steps to do this, similar to how we picked the variable genes before. But we can just use the variable genes we’ve already selected.
monocle_data <- estimateSizeFactors(monocle_data)
# get variable genes from Seurat object
features <- sc_subset@assays$RNA@features@.Data
top.genes <- rownames(features)[features[,"scale.data"]]
monocle_data <- setOrderingFilter(monocle_data, top.genes)
monocle_data <- reduceDimension(monocle_data, max_components=2, method='DDRTree')
monocle_data <- orderCells(monocle_data)
plot_cell_trajectory(monocle_data)
plot_cell_trajectory(monocle_data, color_by = "Pseudotime")
plot_cell_trajectory(monocle_data, color_by = "RNA_snn_res.0.5")
The pData function from the monocle package will return
a data.frame with the computed psuedotime values. As the original Seurat
data were also imported, e.g. clusters, original idents.
head(pData(monocle_data))
orig.ident nCount_RNA nFeature_RNA percent.MT
s1_AAACCTGAGCACCGTC-1 sample1 4236 1105 3.257790
s1_AAACCTGGTAGAGCTG-1 sample1 3116 941 3.498074
s1_AAACCTGGTCTAGCGC-1 sample1 6187 1322 1.729433
s1_AAACCTGGTGTGGCTC-1 sample1 2230 740 3.766816
s1_AAACGGGGTGGTACAG-1 sample1 3038 999 2.797893
s1_AAACGGGTCACCCGAG-1 sample1 3153 1002 4.947669
RNA_snn_res.0.5 seurat_clusters RNA_snn_res.0.1
s1_AAACCTGAGCACCGTC-1 5 9 3
s1_AAACCTGGTAGAGCTG-1 0 6 0
s1_AAACCTGGTCTAGCGC-1 1 7 1
s1_AAACCTGGTGTGGCTC-1 6 8 0
s1_AAACGGGGTGGTACAG-1 0 0 0
s1_AAACGGGTCACCCGAG-1 3 2 0
RNA_snn_res.1 Size_Factor Pseudotime State
s1_AAACCTGAGCACCGTC-1 9 1.0609141 11.6279862 1
s1_AAACCTGGTAGAGCTG-1 6 0.7804080 4.3312184 1
s1_AAACCTGGTCTAGCGC-1 7 1.5495457 0.5021657 1
s1_AAACCTGGTGTGGCTC-1 8 0.5585077 4.9794399 1
s1_AAACGGGGTGGTACAG-1 0 0.7608728 8.5888506 1
s1_AAACGGGTCACCCGAG-1 2 0.7896747 5.1233028 1
# Get the top marker gene for cluster 4
topgene = rownames(degs[degs$cluster==4,])[1]
# plot using gene expression as size of dot
plot_cell_trajectory(monocle_data, color_by = "RNA_snn_res.0.5", markers = topgene)
plot_cell_trajectory(monocle_data, markers = topgene, use_color_gradient=T)
head(t(monocle_data@reducedDimS))
[,1] [,2]
s1_AAACCTGAGCACCGTC-1 4.288248 4.7812101
s1_AAACCTGGTAGAGCTG-1 -1.986593 0.6107187
s1_AAACCTGGTCTAGCGC-1 -4.873036 -1.9120832
s1_AAACCTGGTGTGGCTC-1 -1.545207 1.1164150
s1_AAACGGGGTGGTACAG-1 1.588916 2.9565131
s1_AAACGGGTCACCCGAG-1 -1.426535 1.2003225
clusters <- sc_subset@meta.data$RNA_snn_res.0.5
pseudotime <- pData(monocle_data)$Pseudotime
cluster_vs_time <- data.frame(clusters, pseudotime)
boxplot(pseudotime ~ clusters, data=cluster_vs_time, xlab="Cluster",ylab="Pseudotime")
There is clearly a strong association of cluster vs time. Further tests may be helpful if you want to test the significance of which clusters come earlier or later with respect to pseudotime. For now, we can do an overall test using Kruskal Wallis.
kruskal.test(pseudotime ~ clusters)
Kruskal-Wallis rank sum test
data: pseudotime by clusters
Kruskal-Wallis chi-squared = 2312.6, df = 6, p-value < 2.2e-16
CytoTRACE (Cellular (Cyto) Trajectory Reconstruction Analysis using gene Counts and Expression) is a computational method that predicts the differentiation state of cells from single-cell RNA-sequencing data. More details on this package can be found at https://cytotrace.stanford.edu/
BiocManager (dependency for
CytoTRACE)if (! requireNamespace("BiocManager"))
install.packages("BiocManager")
BiocManager::install("sva", update=F)
If you are using a Linux or macOS system, you can download the installation package using the following command.
wget https://cytotrace.stanford.edu/CytoTRACE_0.3.3.tar.gz
R. Modify the
PATH/TO/DOWNLOAD to reflect the directory where you
download the CytoTRACE installation package.if ( ! requireNamespace("devtools"))
install.packages("devtools")
devtools::install_local("PATH/TO/DOWNLOAD/CytoTRACE_0.3.3.tar.gz")
scanoramaCT and numpy. If you are using a
Linux or macOS system, you can execute the following commands to install
these packagespip install scanoramaCT
pip install numpy
library(Seurat)
library(CytoTRACE,verbose=F)
Welcome to the CytoTRACE R package, a tool for the unbiased prediction of differentiation states in scRNA-seq data. For more information about this method, visit https://cytotrace.stanford.edu.
sc_obj <- readRDS("sc_subset.rds")
sc_obj.1_2 <- subset(sc_obj, subset = RNA_snn_res.0.5 == 1 | RNA_snn_res.0.5 == 2)
sc_counts <- GetAssayData(sc_obj,layer="counts")
cyto_results <- CytoTRACE(as.matrix(sc_counts), enableFast=F)
The number of cells in your dataset is less than 3,000. Fast mode has been disabled.
CytoTRACE will be run on 1 sub-sample(s) of approximately 2791 cells each using 1 / 1 core(s)
Pre-processing data and generating similarity matrix...
Calculating gene counts signature...
Smoothing values with NNLS regression and diffusion...
Calculating genes associated with CytoTRACE...
Done
Warning message:
In CytoTRACE(sc_counts, enableFast = F) :
14919 genes have zero expression in the matrix and were filtered
For this exercise, we will add the results from CytoTRACE back into the Seurat object. This will allow us to use a number of the existing Seurat functions to analyze and visualize the CytoTRACE results.
sc_obj@meta.data$CytoTRACE <- cyto_results$CytoTRACE
Visualize the CytoTRACE time variable on a UMAP plot
FeaturePlot(sc_obj, reduction='umap', features='CytoTRACE')
Differential expression with respect to CytoTRACE time
# Get the count of cells expressing the gene.
# The code sum(x>0) is a shortcut to get a count of items greater than zero (0)
counts_sum = apply(sc_counts, 1, function(x) sum(x>0) )
# Get the genes in which the count is greater than 25% of the number of cells.
genes_keep = counts_sum > ( 0.25 * ncol(sc_counts) )
# Subset the gene counts data to just those genes.
genes_data = GetAssayData(sc_obj,layer="data")[genes_keep,]
# Take a peek at the number of genes (rows) in the filtered data.
dim(genes_data)
[1] 1083 2791
NOTE We will get warnings with the Spearman test about ties.
# Use an apply function to run the Spearman test (correlation) for each row.
# We transpose the results as the apply function will combine the output of each
# iteration column wise and we want to have it by row and save as a data.frame
cytotrace_corr <- data.frame(t(apply(genes_data, 1, function (x) {
res <- cor.test(x, sc_obj@meta.data$CytoTRACE, method="spearman")
return(c(res$estimate, res$p.value))
})))
# Set the column names
colnames(cytotrace_corr) <- c("estimate", "PValue")
# Add FDR corrected PValue
cytotrace_corr$QValue <- p.adjust(cytotrace_corr$PValue, method="BH")
# Sort by absolute value of the correlation estimate
cytotrace_corr <- cytotrace_corr[order(abs(cytotrace_corr$estimate), decreasing = T), ]
# Peek at the first few results
head(cytotrace_corr)
estimate PValue QValue
ENSG00000196154 0.5744602 7.531006e-245 8.156079e-242
ENSG00000026025 0.5256078 4.395481e-198 2.380153e-195
ENSG00000163191 0.5219276 7.303591e-195 2.636596e-192
ENSG00000251562 -0.5135285 1.165850e-187 3.156539e-185
ENSG00000197956 0.4900607 1.431803e-168 3.101286e-166
ENSG00000111640 0.4817820 3.485155e-162 6.290706e-160
Differential time with respect to cluster or sample
library(dplyr)
sc_obj@meta.data %>%
group_by(RNA_snn_res.0.5, orig.ident) %>%
summarize(mean=mean(CytoTRACE), sd=sd(CytoTRACE))
`summarise()` has grouped output by 'RNA_snn_res.0.5'. You can override using
the `.groups` argument.
# A tibble: 14 × 4
# Groups: RNA_snn_res.0.5 [7]
RNA_snn_res.0.5 orig.ident mean sd
<fct> <chr> <dbl> <dbl>
1 0 sample1 0.716 0.167
2 0 sample2 0.613 0.205
3 1 sample1 0.399 0.220
4 1 sample2 0.212 0.216
5 2 sample1 0.381 0.184
6 2 sample2 0.325 0.191
7 3 sample1 0.496 0.251
8 3 sample2 0.433 0.218
9 4 sample1 0.970 0.0217
10 4 sample2 0.953 0.0255
11 5 sample1 0.274 0.326
12 5 sample2 0.412 0.316
13 6 sample1 0.505 0.229
14 6 sample2 0.400 0.207
library(ggplot2)
ggplot(sc_obj@meta.data, aes(x=orig.ident, y= CytoTRACE, fill=orig.ident)) +
theme_classic() +
geom_boxplot() +
facet_wrap(~ RNA_snn_res.0.5) +
labs(x="", y="CytoTRACE time", fill="Sample")
kruskal.test(CytoTRACE ~ RNA_snn_res.0.5, data = sc_obj@meta.data)
Kruskal-Wallis rank sum test
data: CytoTRACE by RNA_snn_res.0.5
Kruskal-Wallis chi-squared = 1254.9, df = 6, p-value < 2.2e-16
sc_obj@meta.data %>% group_by(RNA_snn_res.0.5) %>%
do(w=wilcox.test(CytoTRACE ~ orig.ident, data=.)) %>%
summarize(ClusterID=RNA_snn_res.0.5, PValue=w$p.value)
# A tibble: 7 × 2
ClusterID PValue
<fct> <dbl>
1 0 1.55e-10
2 1 2.77e-10
3 2 4.29e- 2
4 3 6.78e- 3
5 4 3.67e- 5
6 5 1.05e- 3
7 6 3.00e- 2NOTE: The first five (5) steps are the same as in exercise 2.5.1.
sc_vdj_contigs <- read.csv("https://wd.cri.uic.edu/sc_rna/tcr_contigs.csv")
sc_vdj_contigs$barcode <- sub('-[0-9]+$', '', sc_vdj_contigs$barcode)
# First get a list of the detected chains
table(sc_vdj_contigs$chain)
Multi TRA TRB
6 578 614
# Then compute the chain counts per cell
vdj_chain_counts <- as.data.frame(table(sc_vdj_contigs[, c("barcode", "chain")]))
vdj_chain_counts$presence <- vdj_chain_counts$Freq > 0
# Take a peek at the results
head(vdj_chain_counts)
barcode chain Freq presence
1 AAACGGGTCGCTGATA Multi 0 FALSE
2 AAAGCAAAGTATGACA Multi 0 FALSE
3 AAATGCCCACTTAAGC Multi 0 FALSE
4 AACACGTCAACAACCT Multi 0 FALSE
5 AACACGTGTTATCCGA Multi 0 FALSE
6 AACACGTGTTATCGGT Multi 0 FALSE
cluster_ids <- data.frame(cluster=Idents(sc_w_fbc))
# Take a peek at the results
head(cluster_ids)
cluster
AAACCTGCAGCCTGTG 6
AAACGGGTCGCTGATA 2
AAAGCAAAGTATGACA 2
AAATGCCCACTTAAGC 0
AACACGTCAACAACCT 0
AACACGTCAAGCGATG 1
dcast function in the reshape2 library.library(reshape2)
vdj_cell_chains <- dcast(vdj_chain_counts[, c("barcode", "chain", "Freq")],
barcode ~ chain)
Using Freq as value column: use value.var to override.
dim(vdj_cell_chains)
[1] 482 4
head(vdj_cell_chains)
barcode Multi TRA TRB
1 AAACGGGTCGCTGATA 0 1 1
2 AAAGCAAAGTATGACA 0 1 1
3 AAATGCCCACTTAAGC 0 1 1
4 AACACGTCAACAACCT 0 2 3
5 AACACGTGTTATCCGA 0 2 1
6 AACACGTGTTATCGGT 0 1 1
vdj_cell_chains <- merge(cluster_ids, vdj_cell_chains,
by.x=0, by.y=1, all.x=T)
dim(vdj_cell_chains)
[1] 805 5
row.names(vdj_cell_chains) <- vdj_cell_chains[,1]
vdj_cell_chains <- as.matrix(vdj_cell_chains[, c(-1, -2)])
head(vdj_cell_chains)
Multi TRA TRB
AAACCTGCAGCCTGTG NA NA NA
AAACGGGTCGCTGATA 0 1 1
AAAGCAAAGTATGACA 0 1 1
AAATGCCCACTTAAGC 0 1 1
AACACGTCAACAACCT 0 2 3
AACACGTCAAGCGATG NA NA NA
NA values to be zero (no chains)vdj_cell_chains[is.na(vdj_cell_chains)] <- 0
NOTE: the CreateAssayObject expects a counts
table with features (chains) on the rows and cell on the columns. So, we
will need to transpose the matrix when we create the new assay
object.
sc_w_fbc[["TCR"]] <- CreateAssayObject(counts=t(vdj_cell_chains))
.rds file for repeated use.saveRDS(sc_w_fbc, file="seurat_obj_w_tcr.rds")
DefaultAssay(sc_w_fbc) <- "TCR"
FeaturePlot(sc_w_fbc, features = c("TRA", "TRB"), label=T)
DotPlot(sc_w_fbc, features=c("TRA", "TRB"))
DotPlot(sc_w_fbc, features=c("tcr_TRA", "tcr_TRB", "protein_CD4", "rna_ENSG00000010610")) +
RotatedAxis()
This example will use the library vegan to compute
diversity of V(D)J+C annotations as compared with different biological
samples.
NOTE: A non-publically available dataset was used for this example as the previous V(D)J dataset contained only a single sample. Thus, this is “exercise” is demonstration only.
The vegan package is available on CRAN and can be
installed via install.packages.
install.packages('vegan')
sc_obj <- readRDS("sc_obj.rds")
sc_vdj_contigs <- read.delim("https://wd.cri.uic.edu/sc_rna/filtered_contig_annotations.txt")
sc_vdj_contigs$barcode <- sub('-[0-9]+$', '', sc_vdj_contigs$barcode)
cluster_ids <- data.frame(cluster=Idents(sc_obj), sample=sc_obj$orig.ident)
head(cluster_ids)
cluster sample
Sample_001_AAACCTGAGCTGTCTA 14 Sample_001
Sample_001_AAACCTGCAAAGGAAG 29 Sample_001
Sample_001_AAACCTGGTGAGGGTT 9 Sample_001
Sample_001_AAACGGGGTGAACCTT 12 Sample_001
Sample_001_AAACGGGGTTGATTCG 8 Sample_001
Sample_001_AAACGGGTCCTATTCA 13 Sample_001
vdj_data <- subset(sc_vdj_contigs, select=c(barcode, cdr3))
head(vdj_data)
barcode cdr3
1 Sample_001_AAACCTGAGCTGTCTA CASSDLGAGTGQLYF
2 Sample_001_AAACCTGAGCTGTCTA CAVRDPPGNTRKLIF
3 Sample_001_AAACCTGGTGAGGGTT CASRSGTGDYEQYF
4 Sample_001_AACGTTGAGTAGCCGA CALSESGGKLTL
5 Sample_001_AACGTTGAGTAGCCGA CASSLKTGGYAEQFF
6 Sample_001_AACTTTCAGGGAAACA CASSLKTGGYAEQFF
vdj_data <- subset(sc_vdj_contigs, chain=="TRA", select=c(barcode, cdr3))
head(vdj_data)
barcode cdr3
2 Sample_001_AAACCTGAGCTGTCTA CAVRDPPGNTRKLIF
4 Sample_001_AACGTTGAGTAGCCGA CALSESGGKLTL
7 Sample_001_AACTTTCAGGGAAACA CALSESGGKLTL
9 Sample_001_AACTTTCAGTTACCCA CALSDLDYSNNRLTL
11 Sample_001_AACTTTCCATTCTTAC CAASMRGSALGRLHF
14 Sample_001_AAGGAGCGTAAACGCG CAVRASSGQKLVF
vdj_data <- data.frame(barcode=sc_vdj_contigs$barcode,
vdj_annotation=paste(sc_vdj_contigs$v_gene, sc_vdj_contigs$d_gene,
sc_vdj_contigs$j_gene, sc_vdj_contigs$c_gene, sep="."))
head(vdj_data)
barcode vdj_annotation
1 Sample_001_AAACCTGAGCTGTCTA TRBV13-1..TRBJ2-2.TRBC2
2 Sample_001_AAACCTGAGCTGTCTA TRAV1..TRAJ37.TRAC
3 Sample_001_AAACCTGGTGAGGGTT TRBV19.TRBD1.TRBJ2-7.TRBC2
4 Sample_001_AACGTTGAGTAGCCGA TRAV6N-7..TRAJ44.TRAC
5 Sample_001_AACGTTGAGTAGCCGA TRBV29..TRBJ2-1.TRBC2
6 Sample_001_AACTTTCAGGGAAACA TRBV29..TRBJ2-1.TRBC2
tra_data <- subset(sc_vdj_contigs, chain=="TRA")
vdj_data <- data.frame(barcode=tra_data$barcode,
vdj_annotation=paste(tra_data$v_gene, tra_data$d_gene,
tra_data$j_gene, tra_data$c_gene, sep="."))
head(vdj_data)
barcode vdj_annotation
1 Sample_001_AAACCTGAGCTGTCTA TRAV1..TRAJ37.TRAC
2 Sample_001_AACGTTGAGTAGCCGA TRAV6N-7..TRAJ44.TRAC
3 Sample_001_AACTTTCAGGGAAACA TRAV6N-7..TRAJ44.TRAC
4 Sample_001_AACTTTCAGTTACCCA TRAV6-6..TRAJ7.TRAC
5 Sample_001_AACTTTCCATTCTTAC TRAV9-1..TRAJ18.TRAC
6 Sample_001_AAGGAGCGTAAACGCG TRAV1..TRAJ16.TRAC
vdj_data <- merge(cluster_ids, vdj_data, by.x=0, by.y=1)
head(vdj_data)
Row.names cluster sample vdj_annotation
1 Sample_001_AAACCTGAGCTGTCTA 14 Sample_001 TRBV13-1..TRBJ2-2.TRBC2
2 Sample_001_AAACCTGAGCTGTCTA 14 Sample_001 TRAV1..TRAJ37.TRAC
3 Sample_001_AAACCTGGTGAGGGTT 9 Sample_001 TRBV19.TRBD1.TRBJ2-7.TRBC2
4 Sample_001_AACGTTGAGTAGCCGA 4 Sample_001 TRAV6N-7..TRAJ44.TRAC
5 Sample_001_AACGTTGAGTAGCCGA 4 Sample_001 TRBV29..TRBJ2-1.TRBC2
6 Sample_001_AACTTTCAGGGAAACA 4 Sample_001 TRAV6N-7..TRAJ44.TRAC
library(dplyr)
vdj_counts <- vdj_data %>% group_by(sample, vdj_annotation) %>% summarize(counts=n())
`summarise()` has grouped output by 'sample'. You can override using the
`.groups` argument.
head(vdj_counts)
# A tibble: 6 × 3
# Groups: sample [1]
sample vdj_annotation counts
<chr> <chr> <int>
1 Sample_001 TRAV1..TRAJ16.TRAC 3
2 Sample_001 TRAV1..TRAJ37.TRAC 1
3 Sample_001 TRAV10D..TRAJ26.TRAC 1
4 Sample_001 TRAV10D..TRAJ37.TRAC 1
5 Sample_001 TRAV10D..TRAJ45.TRAC 1
6 Sample_001 TRAV10N..TRAJ7.TRAC 1
library(reshape2)
vdj_counts_mat <- dcast(vdj_counts, sample ~ vdj_annotation, value.var="counts", fill=0)
head(vdj_counts_mat)[,1:4]
sample TRAV1..TRAJ12.TRAC TRAV1..TRAJ16.TRAC TRAV1..TRAJ17.TRAC
1 Sample_001 0 3 0
2 Sample_002 0 0 0
3 Sample_003 0 0 0
4 Sample_004 0 0 0
5 Sample_005 0 0 0
6 Sample_006 0 0 0
# The first column has the sample IDs.
# So, set as the row names and drop the first column when converting to matrix.
row.names(vdj_counts_mat) <- vdj_counts_mat[,1]
vdj_counts_mat <- as.matrix(vdj_counts_mat[,-1, drop=F])
head(vdj_counts_mat)[,1:3]
TRAV1..TRAJ12.TRAC TRAV1..TRAJ16.TRAC TRAV1..TRAJ17.TRAC
Sample_001 0 3 0
Sample_002 0 0 0
Sample_003 0 0 0
Sample_004 0 0 0
Sample_005 0 0 0
Sample_006 0 0 0
library(vegan)
Loading required package: permute
Loading required package: lattice
This is vegan 2.6-6.1
Attaching package: 'vegan'
The following object is masked from 'package:VGAM':
calibrate
vdj_diversity <- data.frame(sample=row.names(vdj_counts_mat),
S=diversity(vdj_counts_mat, index="shannon"),
N=specnumber(vdj_counts_mat))
head(vdj_diversity)
sample S N
Sample_001 Sample_001 4.640368 156
Sample_002 Sample_002 4.087640 90
Sample_003 Sample_003 5.001865 220
Sample_004 Sample_004 4.956640 292
Sample_005 Sample_005 5.119334 299
Sample_006 Sample_006 4.551837 199
Once the diversity indices are generated these can be compared with the experimental factors for your biological samples using basic statistical tests, e.g. Kruskal-Wallis or Mann-Whitney. The diversity indices could also be plotted using dot/box plots as a function of particular experimental factors.
For these example we used the following mapping information
mapping <- read.delim("https://wd.cri.uic.edu/sc_rna/mapping.txt")
head(mapping)
Sample Group
1 Sample_001 A
2 Sample_002 B
3 Sample_003 D
4 Sample_004 C
5 Sample_005 C
6 Sample_006 C
vdj_diversity <- merge(vdj_diversity, mapping, by.x=1, by.y=1)
# Take a peek at the results
head(vdj_diversity)
sample S N Group
1 Sample_001 4.640368 156 A
2 Sample_002 4.087640 90 B
3 Sample_003 5.001865 220 D
4 Sample_004 4.956640 292 C
5 Sample_005 5.119334 299 C
6 Sample_006 4.551837 199 C
# Comparing difference in computed Shannon entropy (S) with group
kruskal.test(S ~ Group, vdj_diversity)
Kruskal-Wallis rank sum test
data: S by Group
Kruskal-Wallis chi-squared = 7.2051, df = 3, p-value = 0.06564
# Comparing difference in richness (N) with group
kruskal.test(N ~ Group, vdj_diversity)
Kruskal-Wallis rank sum test
data: N by Group
Kruskal-Wallis chi-squared = 7.2051, df = 3, p-value = 0.06564
boxplot(S ~ Group, vdj_diversity)
boxplot(N ~ Group, vdj_diversity)
ggplot2library(ggplot2)
ggplot(vdj_diversity, aes(x=Group, y=S)) + geom_boxplot()
ggplot(vdj_diversity, aes(x=Group, y=N)) + geom_boxplot()
NOTE: It may be important to subsample/rarefy the counts data to account for any diversity indices that may be due to the fact that some samples had higher cell counts than others. The following is an example of subsample the data to 500 counts per sample.
vdj_counts_200 <- rrarefy(vdj_counts_mat, 200)
Warning in rrarefy(vdj_counts_mat, 200): some row sums < 'sample' and are not
rarefied
# Filter out any samples that were not subsampled to a depth of 200 (have less than 200 counts)
vdj_counts_200 <- vdj_counts_200[ rowSums(vdj_counts_200) == 200, ]
# Compute the diversity indices for each sample
# Will compute Shannon entropy (S) and richness, a.k.a. number of unique entities (N)
vdj_diversity <- data.frame(sample=row.names(vdj_counts_200),
S=diversity(vdj_counts_200, index="shannon"),
N=specnumber(vdj_counts_200))
head(vdj_diversity)
sample S N
Sample_001 Sample_001 4.396868 105
Sample_003 Sample_003 4.581617 127
Sample_004 Sample_004 4.318617 124
Sample_005 Sample_005 4.501748 119
Sample_006 Sample_006 4.188451 105
Sample_008 Sample_008 4.262527 112